{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Action2使用DeepFM对movielens进行评分预测\n",
    "\n",
    "1、使用DeepFM工具进行评分预测，代码正确，可以跑通简单的movielens_sample数据集（10points）    \n",
    "\n",
    "2、可以跑通完整的movielens数据集（10points）    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## movielens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "from sklearn import metrics\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "from deepctr.models import DeepFM\n",
    "from deepctr.feature_column import SparseFeat,get_feature_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"ratings.csv\")\n",
    "sparse_features = [\"userId\", \"movieId\", 'timestamp']\n",
    "target = ['rating']\n",
    "\n",
    "# 对特征标签进行编码\n",
    "for feature in sparse_features:\n",
    "    lbe = LabelEncoder()\n",
    "    data[feature] = lbe.fit_transform(data[feature])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 计算每个特征中的 不同特征值的个数\n",
    "fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]\n",
    "linear_feature_columns = fixlen_feature_columns\n",
    "dnn_feature_columns = fixlen_feature_columns\n",
    "feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将数据集切分成训练集和测试集\n",
    "train, test = train_test_split(data, test_size=0.2)\n",
    "train_model_input = {name:train[name].values for name in feature_names}\n",
    "test_model_input = {name:test[name].values for name in feature_names}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2622/2622 [==============================] - 132s 50ms/step - loss: 0.9398 - mse: 0.9333 - val_loss: 0.7576 - val_mse: 0.7441\n"
     ]
    }
   ],
   "source": [
    "# 使用DeepFM进行训练\n",
    "model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')\n",
    "model.compile(\"adam\", \"mse\", metrics=['mse'], )\n",
    "history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[4.436012 ],\n",
       "       [3.9129455],\n",
       "       [4.157041 ],\n",
       "       ...,\n",
       "       [3.0135837],\n",
       "       [3.823564 ],\n",
       "       [3.8273492]], dtype=float32)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 使用DeepFM进行预测\n",
    "pred_ans = model.predict(test_model_input, batch_size=256)\n",
    "pred_ans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test RMSE 0.8608716512930368\n"
     ]
    }
   ],
   "source": [
    "# 输出RMSE或MSE\n",
    "mse = round(mean_squared_error(test[target].values, pred_ans), 4)\n",
    "rmse = mse ** 0.5\n",
    "print(\"test RMSE\", rmse)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## movielens_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x00000283E1F9F5E0>, embedding_name='movie_id', group_name='default_group', trainable=True), SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x00000283E1F9F7C0>, embedding_name='user_id', group_name='default_group', trainable=True), SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x00000283E1F9F730>, embedding_name='gender', group_name='default_group', trainable=True), SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x00000283E1F9F280>, embedding_name='age', group_name='default_group', trainable=True), SparseFeat(name='occupation', vocabulary_size=20, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x00000283E1F9FEB0>, embedding_name='occupation', group_name='default_group', trainable=True), SparseFeat(name='zip', vocabulary_size=188, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x00000283E1F9F460>, embedding_name='zip', group_name='default_group', trainable=True)]\n",
      "1/1 [==============================] - 1s 756ms/step - loss: 13.8281 - mse: 13.8281 - val_loss: 15.1416 - val_mse: 15.1416\n",
      "test RMSE 3.7609573249373622\n"
     ]
    }
   ],
   "source": [
    "#数据加载\n",
    "data = pd.read_csv(\"movielens_sample.txt\")\n",
    "sparse_features = [\"movie_id\", \"user_id\", \"gender\", \"age\", \"occupation\", \"zip\"]\n",
    "target = ['rating']\n",
    "\n",
    "# 对特征标签进行编码\n",
    "for feature in sparse_features:\n",
    "    lbe = LabelEncoder()\n",
    "    data[feature] = lbe.fit_transform(data[feature])\n",
    "# 计算每个特征中的 不同特征值的个数\n",
    "fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]\n",
    "print(fixlen_feature_columns)\n",
    "linear_feature_columns = fixlen_feature_columns\n",
    "dnn_feature_columns = fixlen_feature_columns\n",
    "feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)\n",
    "\n",
    "# 将数据集切分成训练集和测试集\n",
    "train, test = train_test_split(data, test_size=0.2)\n",
    "train_model_input = {name:train[name].values for name in feature_names}\n",
    "test_model_input = {name:test[name].values for name in feature_names}\n",
    "\n",
    "# 使用DeepFM进行训练\n",
    "model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')\n",
    "model.compile(\"adam\", \"mse\", metrics=['mse'], )\n",
    "history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )\n",
    "# 使用DeepFM进行预测\n",
    "pred_ans = model.predict(test_model_input, batch_size=256)\n",
    "# 输出RMSE或MSE\n",
    "mse = round(mean_squared_error(test[target].values, pred_ans), 4)\n",
    "rmse = mse ** 0.5\n",
    "print(\"test RMSE\", rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
